##########################################
# Replication Data for Proksch, Lowe, Wäckerle, Soroka. (2018). Multilingual Sentiment Analysis: A New Approach to Measuring Conflict in Legislative Speeches. Legislative Studies Quarterly, Forthcoming.
##########################################

#Part 3: Wordscores Replication
#Most of this code follows the replication provided by Herzog and Benoit for the article "The Most Unkindest Cuts: Speaker Selection and Expressed Government Dissent During Economic Crisis"

# Start from an empty workspace so that objects left over from a previous
# session cannot leak into the data sets built below.
rm(list = ls(all = TRUE))
library(rstudioapi)

# Resolve the data paths relative to this script's folder.
# The folder is only known when the script is open in RStudio, so:
#  - outside RStudio (e.g. Rscript / R CMD BATCH) the current working
#    directory is kept instead of aborting with "RStudio not running";
#  - for an unsaved document (empty path) the working directory is also kept,
#    because setwd("") would raise an error.
current_path <- if (rstudioapi::isAvailable()) {
  getActiveDocumentContext()$path
} else {
  ""
}
if (nzchar(current_path)) {
  setwd(dirname(current_path))
}
### PATHS ########################################
# All file names are relative to the working directory.

# input: master data set
dataInPath       <- "3_master_data.RData"

# output: full data sets (main sample / sentiment sample)
dataOutAll       <- "3_working_data_all.RData"
dataOutAll_senti <- "3_working_data_all_senti.RData"

# output: data sets restricted to TDs who spoke (main / sentiment sample)
dataOutSub       <- "3_working_data_speakers.RData"
dataOutSub_senti <- "3_working_data_speakers_senti.RData"
##################################################

# Restore the objects stored in the master file into the workspace.
# The code below expects this to provide a data frame called `data`.
load(file = dataInPath)

# ==================
# = Case selection =
# ==================
# data_senti starts as a full copy of the loaded data. Unlike `data`, it is
# NOT stripped of finance ministers and opposition spokespersons below; all
# other case-selection filters are applied to both data sets.
data_senti <- data

# remove finance ministers and opposition spokesperson
# Logical masks are used instead of `data[-which(cond), ]`: if no row matches,
# `-which(cond)` is `integer(0)` and the subset would silently drop EVERY row.
# `%in% TRUE` treats NA flags as "not flagged", exactly as which() did, so
# rows with a missing flag are still kept.
is_finance_minister <- (data$finance_minister == TRUE) %in% TRUE
data <- data[!is_finance_minister, ]
is_opposition_spokesperson <- (data$opposition_spokesperson == TRUE) %in% TRUE
data <- data[!is_opposition_spokesperson, ]

# remove Cheann Comhairle and Leas Cheann Comhairle (for safety)
# factor() is re-applied afterwards to drop the now-unused position levels
data <- data[!(data$position %in% c("Cheann Comhairle","Leas Cheann Comhairle")),]
data$position <- factor(data$position)
data_senti <- data_senti[!(data_senti$position %in% c("Cheann Comhairle","Leas Cheann Comhairle")),]
data_senti$position <- factor(data_senti$position)


# remove those who were not elected yet (by-elections)
data <- data[data$notElectedYet==0,]
data_senti <- data_senti[data_senti$notElectedYet==0,]

# remove those who resigned or died in office
data <- data[data$removed==0,]
data_senti <- data_senti[data_senti$removed==0,]

# remove cases with missing values
# Note: election data is missing for Ceann Comhairle, who are automatically
# re-elected to the Dail. Because we exclude them from the analysis, missing
# values can be safely removed
data <- data[!is.na(data$first_preference_votes),]
data <- data[!is.na(data$quota),]
data_senti <- data_senti[!is.na(data_senti$first_preference_votes),]
data_senti <- data_senti[!is.na(data_senti$quota),]


# ====================
# = Variable coding =
# ====================
# who spoke?
# spoke = 1 when a text score is recorded for the row, 0 when it is missing
data$spoke <- as.numeric(!is.na(data$textscore))
data_senti$spoke <- as.numeric(!is.na(data_senti$textscore))

# Adds the social need and electoral safety measures to a data set.
# Returns the data frame with four new columns, appended in this order:
# lr_prop_current_year, lr_prop_previous_year, lr_prop, safety.
add_ratio_measures <- function(df) {
  # social need measure (proportion of constituents on the live register)
  # - in current year
  df$lr_prop_current_year <- df$lr_abs / df$population
  # - in previous year
  df$lr_prop_previous_year <- df$lr_abs_lag / df$population

  # adjust for change in debate year
  # Note: unemployment data was merged by debate year.
  # - for budgets 1987-1997, debates were held in January or February of
  #   the same year's budget. Hence we use the previous year's unemployment
  #   rate.
  # - for budgets 1998-2013, debates were held in December for the next
  #   year's budget. Hence we use the debate year's unemployment rate.
  early_budget <- df$budget_year <= 1997
  late_budget <- df$budget_year >= 1998
  df$lr_prop <- NA
  df$lr_prop[early_budget] <- df$lr_prop_previous_year[early_budget]
  df$lr_prop[late_budget] <- df$lr_prop_current_year[late_budget]

  # electoral safety measure (proportion of first preference votes on
  # district quota)
  df$safety <- df$first_preference_votes / df$quota

  df
}

data <- add_ratio_measures(data)
data_senti <- add_ratio_measures(data_senti)

# party size
# Counts the rows per budget year and party, attaches that count to every row
# as `partySize`, and adds its natural log as `log.party.size`.
# merge() keeps only rows whose (budget_year, partyAbbrev) pair appears in the
# count table, and it reorders rows and columns.
add_party_size <- function(df) {
  size_by_party <- data.frame(table(df$budget_year, df$partyAbbrev))
  names(size_by_party) <- c("budget_year", "partyAbbrev", "partySize")
  df <- merge(df, size_by_party, by = c("budget_year", "partyAbbrev"))
  df$log.party.size <- log(df$partySize)
  df
}

# diagnostic: party frequencies including missing values, before the merge
table(data$partyAbbrev, exclude = NULL)
data <- add_party_size(data)

table(data_senti$partyAbbrev, exclude = NULL)
data_senti <- add_party_size(data_senti)

# county size
# Counts the rows per budget year and county, attaches that count to every row
# as `countySize`, and adds its natural log as `log.county.size`.
add_county_size <- function(df) {
  size_by_county <- data.frame(table(df$budget_year, df$county))
  names(size_by_county) <- c("budget_year", "county", "countySize")
  df <- merge(df, size_by_county, by = c("budget_year", "county"))
  df$log.county.size <- log(df$countySize)
  df
}

data <- add_county_size(data)
data_senti <- add_county_size(data_senti)


# Adds the 0/1 indicator variables and the logged number of debate days.
# `%in%` yields FALSE for missing values, so rows with a missing category are
# coded 0.
add_indicators <- function(df) {
  # govt backbenchers
  df$backbench <- as.numeric(df$position %in% "Govt backbencher")

  # government dummy
  df$government <- as.numeric(df$govt %in% "Government")

  # economic periods: crisis = 1, every other period = 0 (reference category)
  df$crisis <- as.numeric(df$periodEcon %in% "Crisis")

  # log of number of debate days
  df$log.debate.days <- log(df$debate.days)

  df
}

data <- add_indicators(data)
data_senti <- add_indicators(data_senti)


# ==================
# = Year selection =
# ==================
# remove pre-1987 years (keep budget years from 1987 onwards)
data <- data[!(data$budget_year < 1987), ]
data_senti <- data_senti[!(data_senti$budget_year < 1987), ]


# ===================
# = SAVED DATA SETS =
# ===================

# save full data set
# ------------------
# Finalises a full data set: adds the standardised covariates and a
# consecutive member index, and returns the resulting data frame.
finalize_full_data <- function(df) {
  # standardise variables over the whole data set
  # (scale() subtracts the mean and divides by the standard deviation)
  standardize <- function(x) as.numeric(scale(x, center = TRUE, scale = TRUE))
  df$lr_propScaled <- standardize(df$lr_prop)
  df$safetyScaled <- standardize(df$safety)
  df$seniorityYearsScaled <- standardize(df$seniorityYears)

  # standardise the unemployment measure within each budget year
  df$lr_prop_yearScaled <- NA
  for (year in unique(df$budget_year)) {
    in_year <- df$budget_year == year
    df$lr_prop_yearScaled[in_year] <- as.numeric(scale(df$lr_prop[in_year]))
  }

  df$unemployment <- df$lr_propScaled

  # consecutive memberID: m is the rank of each distinct memberID
  member_ids <- unique(df$memberID)
  merge(df, data.frame(memberID = member_ids, m = rank(member_ids)))
}

dataAll <- finalize_full_data(data)
save(dataAll, file = dataOutAll)

dataAll_senti <- finalize_full_data(data_senti)
save(dataAll_senti, file = dataOutAll_senti)


# save data set with only those TDs who spoke
# -------------------------------------------
# Restricts a data set to rows with spoke == 1, then adds the standardised
# covariates and a consecutive member index. All statistics are computed on
# the speaker subset only.
finalize_speaker_data <- function(df) {
  speakers <- df[df$spoke == 1, ]

  # standardise variables over the speaker subset
  # (scale() subtracts the mean and divides by the standard deviation)
  standardize <- function(x) as.numeric(scale(x, center = TRUE, scale = TRUE))
  speakers$lr_propScaled <- standardize(speakers$lr_prop)
  speakers$safetyScaled <- standardize(speakers$safety)
  speakers$seniorityYearsScaled <- standardize(speakers$seniorityYears)

  # standardise the unemployment measure within each budget year
  speakers$lr_prop_yearScaled <- NA
  for (year in unique(speakers$budget_year)) {
    in_year <- speakers$budget_year == year
    speakers$lr_prop_yearScaled[in_year] <-
      as.numeric(scale(speakers$lr_prop[in_year]))
  }

  speakers$unemployment <- speakers$lr_propScaled

  # consecutive memberID: m is the rank of each distinct memberID
  member_ids <- unique(speakers$memberID)
  merge(speakers, data.frame(memberID = member_ids, m = rank(member_ids)))
}

dataSub <- finalize_speaker_data(data)
dataSub_senti <- finalize_speaker_data(data_senti)

save(dataSub, file = dataOutSub)
save(dataSub_senti, file = dataOutSub_senti)

